import numpy as np
import pandas as pd
import geopandas as gpd
import rasterio
from rasterio import features
import matplotlib.pyplot as plt
import sklearn
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score, accuracy_score, classification_report
from pathlib import Path
from IPython.display import display
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.offline
# Enable Plotly's offline mode so figures render inline in the notebook
# without a Plotly account or network access.
plotly.offline.init_notebook_mode()
print('All libraries successfully imported!')
print(f'Scikit-learn: {sklearn.__version__}')
All libraries successfully imported! Scikit-learn: 0.24.2
# Root of the course data tree and the group identifier.
computer_path = '/export/miro/ndeffense/LBRAT2104/'
grp_letter = 'X'

# Look-up-table directory (crop dictionary spreadsheet lives here).
lut_path = computer_path + 'data/LUT/'

# Per-group working directories (in-situ data, classification results,
# accuracy metrics).
work_path = computer_path + f'GROUP_{grp_letter}/WORK/'
in_situ_path = work_path + 'IN_SITU/'
classif_path = work_path + 'CLASSIF/'
am_path = work_path + 'ACCURACY_METRICS/'

# Make sure the accuracy-metrics folder exists before anything writes to it.
Path(am_path).mkdir(parents=True, exist_ok=True)
print(f'Accuracy Metrics path is set to : {am_path}')
Accuracy Metrics path is set to : /export/miro/ndeffense/LBRAT2104/GROUP_X/WORK/ACCURACY_METRICS/
# Study-site parameters.
site = 'NAMUR'
year = '2020'
no_data = 0  # pixel value treated as "no data" in every raster below

# Shapefile attribute fields: the numeric class code and its readable name.
field_classif_code = 'sub_nb'
field_classif_name = 'sub'

# Input files: crop look-up table, validation ROIs, classification map.
s4s_lut_xlsx = f'{lut_path}crop_dictionary_new.xlsx'
_roi_stem = f'{in_situ_path}{site}_{year}_IN_SITU_ROI_VAL'
in_situ_val_shp = _roi_stem + '.shp'
in_situ_val_tif = _roi_stem + '.tif'
classif_tif = f'{classif_path}{site}_{year}_CLASSIF_RF_with_NDVI.tif'

# Output files for the confusion matrix (CSV table, static image, HTML plot).
_cm_stem = f'{am_path}{site}_{year}_CM'
cm_csv = _cm_stem + '.csv'
cm_png = _cm_stem + '.png'
cm_html = _cm_stem + '.html'
print(f'Raster template file : {classif_tif}')
# Rasterize the validation ROI polygons onto the classification raster's grid.
# The notebook export lost the if/else indentation; restored here.

# Open the validation ROI shapefile with GeoPandas.
in_situ_gdf = gpd.read_file(in_situ_val_shp)

# Open the classification raster: it is the template (grid, transform,
# metadata) for rasterizing the vector ROIs. `with` guarantees the dataset
# is closed even if rasterization fails.
with rasterio.open(classif_tif, "r") as src:
    # Copy before updating so we don't mutate the open dataset's metadata.
    out_meta = src.meta.copy()
    out_meta.update(nodata=no_data)

    # Compare CRS via EPSG codes. The original parsed str(crs).split(":")[1],
    # which raises IndexError for any CRS string without a colon; to_epsg()
    # is the supported way to get the numeric code (returns None if unknown).
    crs_shp = in_situ_gdf.crs.to_epsg()
    crs_tif = src.crs.to_epsg()
    print(f'The CRS of in situ data is : {crs_shp}')
    print(f'The CRS of raster template is : {crs_tif}')

    if crs_shp == crs_tif:
        print("CRS are the same")
        print(f'Rasterize starts : {in_situ_val_shp}')

        # Burn the (geometry, class code) pairs into a new raster aligned on
        # the template grid, then write it out.
        with rasterio.open(in_situ_val_tif, 'w+', **out_meta) as dst:
            dst_arr = dst.read(1)
            geom_col = in_situ_gdf.geometry
            code_col = in_situ_gdf[field_classif_code].astype(int)
            # Generator of (geom, value) pairs consumed by features.rasterize.
            shapes = ((geom, value) for geom, value in zip(geom_col, code_col))
            in_situ_arr = features.rasterize(shapes=shapes,
                                             fill=no_data,
                                             out=dst_arr,
                                             transform=dst.transform)
            dst.write_band(1, in_situ_arr)
        print(f'Rasterize is done : {in_situ_val_tif}')
    else:
        # Typo fixed: "repoject" -> "reproject".
        print('CRS are different --> reproject in-situ data shapefile with "to_crs"')
Raster template file : /export/miro/ndeffense/LBRAT2104/GROUP_X/WORK/CLASSIF/NAMUR_2020_CLASSIF_RF_with_NDVI.tif The CRS of in situ data is : 32631 The CRS of raster template is : 32631 CRS are the same Rasterize starts : /export/miro/ndeffense/LBRAT2104/GROUP_X/WORK/IN_SITU/NAMUR_2020_IN_SITU_ROI_VAL.shp Rasterize is done : /export/miro/ndeffense/LBRAT2104/GROUP_X/WORK/IN_SITU/NAMUR_2020_IN_SITU_ROI_VAL_1.tif
# ## y_pred and y_true
# Open in-situ used for validation
# Load the two rasters and extract paired (reference, prediction) label
# vectors over the validation pixels only.

# Read the rasterized in-situ validation ROIs.
with rasterio.open(in_situ_val_tif, "r") as src:
    val_arr = src.read(1)

# Read the classification map.
with rasterio.open(classif_tif, "r") as src:
    classif_arr = src.read(1)

# Boolean mask of the validation pixels: every pixel that is not no-data.
# (Direct comparison replaces np.where(...).astype(bool), same result.)
idx = val_arr != no_data

# Ground truth (correct) target values.
y_true = val_arr[idx]
# Estimated targets as returned by the classifier.
y_pred = classif_arr[idx]

# All class labels present in either the reference or the prediction.
labels_all = np.concatenate((y_true, y_pred))
labels_unique = np.unique(labels_all)
print(labels_all)
print(labels_unique)
[3199 3199 3199 ... 1111 1111 1111] [1111 1121 1152 1171 1192 1435 1511 1771 1811 1911 1923 3199 4111 6999 8111 8411]
# Load the crop look-up table (class code -> readable name).
lut_df = pd.read_excel(s4s_lut_xlsx)

# Unique class codes present in the validation set, ascending.
# sorted(set(...)) replaces the roundabout sorted(list(dict.fromkeys(...))).
classes = sorted(set(y_true))

# Matching readable names, sorted by the same code so the two lists align.
# NOTE(review): this assumes every code in `classes` exists in the LUT —
# a missing code would silently shift the name/code pairing; verify the LUT.
classes_name = (lut_df[lut_df[field_classif_code].isin(classes)]
                .sort_values(field_classif_code)[field_classif_name]
                .to_list())

for code, name in zip(classes, classes_name):
    print(f'{code} - {name}')
1111 - Winter wheat 1121 - Maize 1152 - Barley six-row 1171 - Oats 1192 - Other cereals 1435 - Rapeseed 1511 - Potatoes 1771 - Peas 1811 - Sugar beet 1911 - Alfalfa 1923 - Flax, hemp and other similar crops 3199 - Grassland and meadows 4111 - Fallows 1 year 6999 - Forest 8111 - Urban 8411 - Greenhouses
# Confusion matrix: rows = reference (y_true), columns = prediction (y_pred).
# With no explicit `labels` argument the classes are the sorted union of
# y_true and y_pred, i.e. exactly the order of `labels_unique`.
cm = confusion_matrix(y_true, y_pred)

# Attach the class codes as index/columns at construction time, THEN save.
# The original wrote the CSV before labelling, so the file carried a
# meaningless 0..n-1 header instead of the class codes.
cm_df = pd.DataFrame(cm, index=labels_unique, columns=labels_unique)
cm_df.to_csv(cm_csv, index=False, sep=';')
cm_values = cm_df.values
display(cm_df)
| 1111 | 1121 | 1152 | 1171 | 1192 | 1435 | 1511 | 1771 | 1811 | 1911 | 1923 | 3199 | 4111 | 6999 | 8111 | 8411 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1111 | 18463 | 5 | 2077 | 130 | 1237 | 58 | 0 | 0 | 0 | 0 | 114 | 65 | 4 | 1 | 64 | 9 |
| 1121 | 0 | 5026 | 0 | 1 | 0 | 12 | 266 | 0 | 1061 | 0 | 0 | 112 | 0 | 0 | 28 | 0 |
| 1152 | 214 | 0 | 4307 | 241 | 7 | 14 | 0 | 0 | 0 | 0 | 0 | 239 | 0 | 0 | 1 | 0 |
| 1171 | 75 | 3 | 0 | 203 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | 0 | 1 | 0 | 0 | 0 |
| 1192 | 3307 | 64 | 8 | 132 | 667 | 0 | 0 | 9 | 0 | 0 | 165 | 222 | 0 | 0 | 4 | 13 |
| 1435 | 23 | 0 | 7 | 0 | 0 | 423 | 0 | 0 | 0 | 0 | 0 | 38 | 0 | 0 | 7 | 0 |
| 1511 | 0 | 1969 | 0 | 0 | 0 | 0 | 2859 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 8 | 0 |
| 1771 | 7 | 0 | 0 | 0 | 0 | 0 | 2 | 892 | 0 | 0 | 2 | 0 | 0 | 0 | 125 | 0 |
| 1811 | 2 | 153 | 0 | 0 | 0 | 0 | 238 | 0 | 5190 | 0 | 0 | 9 | 4 | 0 | 0 | 0 |
| 1911 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 151 | 0 | 0 | 0 | 0 |
| 1923 | 12 | 0 | 0 | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 897 | 1 | 0 | 0 | 7 | 0 |
| 3199 | 5 | 0 | 13 | 4 | 0 | 4 | 0 | 1 | 0 | 18 | 0 | 17384 | 3 | 340 | 152 | 1 |
| 4111 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 0 | 0 | 0 | 0 | 44 | 2 | 0 | 0 | 0 |
| 6999 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 32 | 0 | 702 | 38 | 0 |
| 8111 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 502 | 0 |
| 8411 | 0 | 30 | 0 | 0 | 0 | 0 | 0 | 66 | 6 | 0 | 0 | 0 | 0 | 0 | 57 | 0 |
# Plotly's annotated heatmap draws rows bottom-up, so flip the matrix and
# the row labels to keep the first class at the top of the figure.
heat_z = cm[::-1]
axis_labels = classes_name
row_labels = axis_labels[::-1].copy()

# String version of every cell, used as the on-plot annotations.
cell_text = [[str(v) for v in row] for row in heat_z]
#print(cell_text)

# Build the annotated heatmap figure.
fig = ff.create_annotated_heatmap(heat_z,
                                  x=axis_labels,
                                  y=row_labels,
                                  annotation_text=cell_text,
                                  colorscale='spectral',
                                  reversescale=True)  # RdYlGn

# Title the figure.
fig.update_layout(title_text=f"Confusion Matrix - {site}, {year}",
                  #xaxis = dict(title='x'),
                  #yaxis = dict(title='x')
                  )

# Widen top/left margins so the title and long class names fit.
fig.update_layout(margin=dict(t=200, l=200))
fig.update_xaxes(tickfont_size=20)
fig.update_yaxes(tickfont_size=20)
fig.update_layout(font_size=25)

# Show the colorbar next to the heatmap.
fig['data'][0]['showscale'] = True
fig.show()
fig.write_html(cm_html)
# Per-class precision / recall / F1-score, produced twice: once as printable
# text and once as a dict so it can be loaded into a DataFrame.
report_kwargs = dict(target_names=classes_name)
acc_metrics_str = classification_report(y_true, y_pred, digits=3, **report_kwargs)
acc_metrics_dict = classification_report(y_true, y_pred, output_dict=True, **report_kwargs)
am_df = pd.DataFrame.from_dict(acc_metrics_dict).round(3)
print(acc_metrics_str)
precision recall f1-score support
Winter wheat 0.835 0.831 0.833 22227
Maize 0.693 0.773 0.731 6506
Barley six-row 0.672 0.857 0.753 5023
Oats 0.280 0.710 0.402 286
Other cereals 0.349 0.145 0.205 4591
Rapeseed 0.828 0.849 0.838 498
Potatoes 0.849 0.591 0.697 4838
Peas 0.921 0.868 0.894 1028
Sugar beet 0.829 0.927 0.876 5596
Alfalfa 0.000 0.000 0.000 151
Flax, hemp and other similar crops 0.759 0.963 0.849 931
Grassland and meadows 0.950 0.970 0.960 17925
Fallows 1 year 0.143 0.041 0.063 49
Forest 0.672 0.902 0.771 778
Urban 0.506 0.980 0.667 512
Greenhouses 0.000 0.000 0.000 159
accuracy 0.809 71098
macro avg 0.580 0.651 0.596 71098
weighted avg 0.798 0.809 0.797 71098
# Overall accuracy in percent. The 'accuracy' column holds the same scalar
# in every row, so take the first one with .iloc: plain [0] on this
# string-indexed Series ('precision', 'recall', ...) relies on pandas'
# deprecated positional fallback and will break in newer versions.
oa = am_df['accuracy'].iloc[0] * 100
print(f'Overall Accuracy : {oa}%')
Overall Accuracy : 80.9%
Some labels present in y_true may not appear in y_pred at all (classes the model never predicted). The next cell checks whether any validation class is missing from the classification:
# Classes present in the reference that the classifier never predicted.
classes_missing = set(y_true).difference(y_pred)
print(f'{len(classes_missing)} classes are missing in the classification (y_pred) : {classes_missing} \n')

# Passing `labels` explicitly keeps every class in the report, even one
# with zero predicted samples.
acc_metrics_all = classification_report(y_true, y_pred,
                                        labels=labels_unique,
                                        target_names=classes_name,
                                        digits=3)
print(acc_metrics_all)
0 classes are missing in the classification (y_pred) : set()
precision recall f1-score support
Winter wheat 0.835 0.831 0.833 22227
Maize 0.693 0.773 0.731 6506
Barley six-row 0.672 0.857 0.753 5023
Oats 0.280 0.710 0.402 286
Other cereals 0.349 0.145 0.205 4591
Rapeseed 0.828 0.849 0.838 498
Potatoes 0.849 0.591 0.697 4838
Peas 0.921 0.868 0.894 1028
Sugar beet 0.829 0.927 0.876 5596
Alfalfa 0.000 0.000 0.000 151
Flax, hemp and other similar crops 0.759 0.963 0.849 931
Grassland and meadows 0.950 0.970 0.960 17925
Fallows 1 year 0.143 0.041 0.063 49
Forest 0.672 0.902 0.771 778
Urban 0.506 0.980 0.667 512
Greenhouses 0.000 0.000 0.000 159
accuracy 0.809 71098
macro avg 0.580 0.651 0.596 71098
weighted avg 0.798 0.809 0.797 71098
If you decide that you are not interested in the scores of labels that were not predicted, then you can explicitly specify the labels you are interested in (which are labels that were predicted at least once).
# Restrict the report to classes that were predicted at least once; without
# target_names the raw class codes are shown instead of names.
predicted_labels = np.unique(y_pred)
acc_metrics_pred = classification_report(y_true, y_pred,
                                         labels=predicted_labels,
                                         digits=3)
print(acc_metrics_pred)
precision recall f1-score support
1111 0.835 0.831 0.833 22227
1121 0.693 0.773 0.731 6506
1152 0.672 0.857 0.753 5023
1171 0.280 0.710 0.402 286
1192 0.349 0.145 0.205 4591
1435 0.828 0.849 0.838 498
1511 0.849 0.591 0.697 4838
1771 0.921 0.868 0.894 1028
1811 0.829 0.927 0.876 5596
1911 0.000 0.000 0.000 151
1923 0.759 0.963 0.849 931
3199 0.950 0.970 0.960 17925
4111 0.143 0.041 0.063 49
6999 0.672 0.902 0.771 778
8111 0.506 0.980 0.667 512
8411 0.000 0.000 0.000 159
accuracy 0.809 71098
macro avg 0.580 0.651 0.596 71098
weighted avg 0.798 0.809 0.797 71098